Read income data

# ::::: read income data :::::
incex <- read.table('/Users/youmisuk/Desktop/DS3003/W1 - Syllabus & R intro/income_exmpl.dat', header = T, sep = "\t")
summary(incex)
     sex                 age            edu                occ           
 Length:1922        Min.   :18.00   Length:1922        Length:1922       
 Class :character   1st Qu.:31.00   Class :character   Class :character  
 Mode  :character   Median :42.00   Mode  :character   Mode  :character  
                    Mean   :41.69                                        
                    3rd Qu.:52.00                                        
                    Max.   :65.00                                        
      oexp          income    
 Min.   : 0.0   Min.   : 704  
 1st Qu.: 8.0   1st Qu.:1117  
 Median :19.0   Median :1304  
 Mean   :19.5   Mean   :1313  
 3rd Qu.:30.0   3rd Qu.:1506  
 Max.   :48.0   Max.   :2115  
# change order of factor levels
incex$occ <- factor(incex$occ, levels = c('low', 'med.', 'high'))
incex$edu <- factor(incex$edu, levels = c('low', 'med.', 'high'))
incex$sex <- factor(incex$sex, levels = c('m', 'f'), labels = c('male', 'female'))

attach(incex) # attach data frame (facilitates access to variables)

Barplot

- Simple barplots and pie chart

tab <- table(occ, sex)   # table we want to plot
tab
      sex
occ    male female
  low   223    372
  med.  394    306
  high  452    175
par(mfcol = c(2,2))   # set up multiple frames for plotting (2 rows, 2 columns; filled by columns)
op <- par(mar = c(4.5, 4, 3, 0))  # change the margin of the plot (see help(par))
                                  # old parameter values are stored under "op"
barplot(tab[, 1], xlab = 'Occupational status', ylab = 'Frequency', 
        col = c('grey90', 'grey60', 'grey30'), main = 'Men')
barplot(tab[, 2], xlab = 'Occupational status', ylab = 'Frequency', 
        col = c('grey90', 'grey60', 'grey30'), main = 'Women')
par(mar = c(2, 1, 2, 0))
pie(tab[, 1], col = c('grey90', 'grey60', 'grey30'), main = 'Men')
pie(tab[, 2], col = c('grey90', 'grey60', 'grey30'), main = 'Women')

par(op)   # sets parameter values back to old values (ie margins as before)

- Stacked barplot

par(mfrow = c(1, 2))  # new multiple frame plot (1 row, 2 columns)
barplot(tab, xlab = 'gender', ylab = 'Frequency', 
        col = c('grey90', 'grey60', 'grey30'), main = 'Occupational status')
text(rep(.7, 3), (cumsum(c(0, tab[-3, 1])) + cumsum(tab[, 1]))/2, 
     dimnames(tab)$occ, col = c('black', 'white', 'white'))
text(rep(1.9, 3), (cumsum(c(0, tab[-3, 2])) + cumsum(tab[, 2]))/2, 
     dimnames(tab)$occ, col = c('black', 'white', 'white'))
ptab <- prop.table(tab, 2) 
barplot(ptab, xlab = 'gender', ylab = 'Frequency', 
        col = c('grey90', 'grey60', 'grey30'), main = 'Occupational status')
text(rep(.7, 3), (cumsum(c(0, ptab[-3, 1])) + cumsum(ptab[, 1]))/2, 
     dimnames(ptab)$occ, col = c('black', 'white', 'white'))
text(rep(1.9, 3), (cumsum(c(0, ptab[-3, 2])) + cumsum(ptab[, 2]))/2, 
     dimnames(ptab)$occ, col = c('black', 'white', 'white'))

?text

- Frequency tables & frequency distribution

head(f <- table(age))      # simple frequency table of age
age
18 19 20 21 22 23 
12  8 15 18 29 31 
                       # putting the assignment into parentheses assigns & 
                       # simultan. prints the result of expression on the screen
head(p <- f / sum(f))      # relative distribution (proportions)
age
         18          19          20          21          22          23 
0.006243496 0.004162331 0.007804370 0.009365245 0.015088450 0.016129032 
head(p <- prop.table(f))   # alternative way for getting proportions: prop.table()
age
         18          19          20          21          22          23 
0.006243496 0.004162331 0.007804370 0.009365245 0.015088450 0.016129032 
head(cf <- cumsum(f))      # cumulative frequencies
 18  19  20  21  22  23 
 12  20  35  53  82 113 
head(cp <- cumsum(p))      # cumulative proportions
         18          19          20          21          22          23 
0.006243496 0.010405827 0.018210198 0.027575442 0.042663892 0.058792924 
# create a complete frequency/distribution table using column-bind (cbind()) 
# cbind(freq = f, prop = p, cum.freq = cf, cum.prop = cp)  
head(cbind(freq = f, prop = p, cum.freq = cf, cum.prop = cp), 10)
   freq        prop cum.freq    cum.prop
18   12 0.006243496       12 0.006243496
19    8 0.004162331       20 0.010405827
20   15 0.007804370       35 0.018210198
21   18 0.009365245       53 0.027575442
22   29 0.015088450       82 0.042663892
23   31 0.016129032      113 0.058792924
24   38 0.019771072      151 0.078563996
25   62 0.032258065      213 0.110822060
26   48 0.024973985      261 0.135796046
27   50 0.026014568      311 0.161810614
# ::::: write a simple function producing a complete frequency table  :::::
freq.tab <- function(x)
{
   # produces a frequency table for a vector x
   f <- table(x)
   p <- prop.table(f)
   cbind(freq = f, prop = p, cum.freq = cumsum(f), cum.prop = cumsum(p)) 
}  

# freq.tab(age)
head(freq.tab(age), 10)
   freq        prop cum.freq    cum.prop
18   12 0.006243496       12 0.006243496
19    8 0.004162331       20 0.010405827
20   15 0.007804370       35 0.018210198
21   18 0.009365245       53 0.027575442
22   29 0.015088450       82 0.042663892
23   31 0.016129032      113 0.058792924
24   38 0.019771072      151 0.078563996
25   62 0.032258065      213 0.110822060
26   48 0.024973985      261 0.135796046
27   50 0.026014568      311 0.161810614
# ::::: frequency distribution of "age" :::::
# by defining "age" as factor plot() produces a strip chart
# the arguments "space" controls the space between bars
plot(as.factor(age), space = 2, ylab = 'Frequency', xlab = 'Age',
     main = 'Age Distribution')

Histogram

# ::::: creating grouped data from "age" :::::
head(age5 <- cut(age, seq(15, 65, by = 5)), 20)    # cuts the data into 5y age groups
 [1] (60,65] (30,35] (55,60] (60,65] (15,20] (35,40] (35,40] (50,55] (45,50]
[10] (50,55] (50,55] (50,55] (20,25] (55,60] (35,40] (55,60] (30,35] (45,50]
[19] (30,35] (55,60]
10 Levels: (15,20] (20,25] (25,30] (30,35] (35,40] (40,45] (45,50] ... (60,65]
head(age10 <- cut(age, seq(15, 65, by = 10)), 20)  # cuts the data into 10y age groups
 [1] (55,65] (25,35] (55,65] (55,65] (15,25] (35,45] (35,45] (45,55] (45,55]
[10] (45,55] (45,55] (45,55] (15,25] (55,65] (35,45] (55,65] (25,35] (45,55]
[19] (25,35] (55,65]
Levels: (15,25] (25,35] (35,45] (45,55] (55,65]
(tab5 <- table(age5))                      # frequency distribution of age5
age5
(15,20] (20,25] (25,30] (30,35] (35,40] (40,45] (45,50] (50,55] (55,60] (60,65] 
     35     178     235     237     221     227     230     233     224     102 
(tab10 <- table(age10))                    # frequency distribution of age10
age10
(15,25] (25,35] (35,45] (45,55] (55,65] 
    213     472     448     463     326 
# use our freq.tab() function for investigating the frequency distribution
# of the grouped data
freq.tab(age5)
        freq       prop cum.freq  cum.prop
(15,20]   35 0.01821020       35 0.0182102
(20,25]  178 0.09261186      213 0.1108221
(25,30]  235 0.12226847      448 0.2330905
(30,35]  237 0.12330905      685 0.3563996
(35,40]  221 0.11498439      906 0.4713840
(40,45]  227 0.11810614     1133 0.5894901
(45,50]  230 0.11966701     1363 0.7091571
(50,55]  233 0.12122789     1596 0.8303850
(55,60]  224 0.11654527     1820 0.9469303
(60,65]  102 0.05306972     1922 1.0000000
freq.tab(age10)
        freq      prop cum.freq  cum.prop
(15,25]  213 0.1108221      213 0.1108221
(25,35]  472 0.2455775      685 0.3563996
(35,45]  448 0.2330905     1133 0.5894901
(45,55]  463 0.2408949     1596 0.8303850
(55,65]  326 0.1696150     1922 1.0000000
# ::::: plot these data is using hist() :::::
par(mfrow = c(2,2))
# frequency plots
hist(age, seq(15, 65, by = 5), xlab = 'Age', main = '', col = 'grey90')
hist(age, seq(15, 65, by = 10), xlab = 'Age', main = '', col = 'grey90')
# density plots (prob = T)
hist(age, seq(15, 65, by = 5), xlab = 'Age', main = '', col = 'grey90', prob = T)
hist(age, seq(15, 65, by = 10), xlab = 'Age', main = '', col = 'grey90', prob = T)

Kernel Density Estimation

# ::::: write our own function kden() :::::
kden <- function(x, h, x.val = seq(min(x)-2*h, max(x)+2*h, length = 200)) {

   # kernel density with uniform (rectangular) kernel
   # calculates density at deliberately chosen values x.val
   # x     ... numeric data vector
   # h     ... bandwidth of uniform kernel
   # x.val ... numeric vector of values at which the density should be estimated

   x.val <- sort(unique(x.val))  # sorts the unique values of x
   n <- length(x)                # number of observations

   # define a kernel function dens.f() inside the kden() function
   dens.f <- function(x.v) {             # fct. computes the density at x.v      
      z <- (x.v - x) / h                 # standardization
      d <- ifelse(z > -1 & z < 1, .5, 0) # computes kernel density, either .5 or 0
      sum(d) / n / h                     # compute density
   }
   dens <- sapply(x.val, dens.f)
   data.frame(x = x.val, y = dens)  # combine output into a data frame and returns it          
}

# compute the kernel density estimate for income with a bandwidth of 100 EUR
den <- kden(incex$income, h = 100)  
head(den, 10)
          x y
1  504.0000 0
2  513.1005 0
3  522.2010 0
4  531.3015 0
5  540.4020 0
6  549.5025 0
7  558.6030 0
8  567.7035 0
9  576.8040 0
10 585.9045 0
den$x <- den$x - 100 # for a correct step function (use oexp to clearly see it)
plot(den, type = 's', lwd = 3, xlab = 'Income', ylab = 'Density', 
     main = 'Income Distribution')
rug(incex$income, ticksize = .02)   # adds a "rug" plot

# ::::: use built-in function in R: density() :::::
plot(density(incex$income), main = 'Income Distribution')

# combination of histogram and kernel density estimation
hist(incex$income, prob = T, xlab = 'Income', main = 'Income Distribution')
lines(density(incex$income), lwd = 3, col = 'red')

Boxplots

# ::::: computation of quantiles :::::
quantile(age, prob = c(0, .25, .50, .75, 1))   # quartiles
  0%  25%  50%  75% 100% 
  18   31   42   52   65 
quantile(age, prob = seq(.0, 1, by = .2))      # quintiles
  0%  20%  40%  60%  80% 100% 
  18   29   37   46   54   65 
quantile(age, prob = seq(.0, 1, by = 1/7))     # septiles
       0% 14.28571% 28.57143% 42.85714% 57.14286% 71.42857% 85.71429%      100% 
       18        27        33        39        45        51        57        65 
quantile(age, prob = seq(.0, 1, by = .1))      # deciles
  0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
  18   25   29   33   37   42   46   50   54   58   65 
par(mfrow=c(1,2))
boxplot(age, horizontal = T, main="Age")
boxplot(income, horizontal = T, main="Income")

boxplot(income ~ edu + sex, col = rep(c('blue', 'red'), each = 3), ylab = 'Income', names = c('male\nlow', 'male\nmed.', 'male\nhigh', 'female\nlow', 'female\nmed.', 'female\nhigh'), main = 'Income by gender and educational level')

# boxplot with notches and variable width
# notches indicate a 95% confidence interval for the median
# the variable width of the boxes reflect differences in sample size
boxplot(income ~ edu + sex, col = rep(c('blue', 'red'), each = 3), ylab = 'Income', names = c('male\nlow', 'male\nmed.', 'male\nhigh', 'female\nlow', 'female\nmed.', 'female\nhigh'), main = 'Income by gender and educational level', notch = T, varwidth = T)

Quantile-comparison plots

(A) Comparison of densities

# way 1
curve(dnorm, -4, 4, lwd = 3, bty = 'L', ylab = 'Density', 
   xlab = 'Standard Normal Distr. / Income (Standardized)')   # theoretical distribution (standard normal)
lines(density(scale(incex$income)), col = 'red', lwd = 3, lty = 2)
legend('topright', c('Income', 'Normal Distr.'), bty = 'n',
   col = c('red', 'black'), lty = c(2, 1), lwd = 3)

# way 2
plot(density(scale(incex$income)), col = 'red', lwd = 3, lty = 2, xlim=c(-4, 4), ylim=c(0, 0.4), main="",
  bty = 'L', ylab = 'Density', xlab = 'Standard Normal Distr. / Income (Standardized)')
curve(dnorm, -4, 4, lwd = 3, add=T)   # theoretical distribution (standard normal)
legend('topright', c('Income', 'Normal Distr.'), bty = 'n',
   col = c('red', 'black'), lty = c(2, 1), lwd = 3)

(B) Quantile-comparison plots

my own function, qq

# ::::: writing our own function qq() :::::
qq <- function(x, ...) 
{
   # computes empirical and normal quantiles for a numeric vector x

   n <- length(x)        # number of observations
   x <- sort(x)          # rank-ordered data
   eq <- scale(x)        # standardized empirical quantiles
   p <- (1:n - .5) / n   # cumulative proportions
   tq <- qnorm(p)        # theoretical (normal) quantiles
   
   plot(tq, x, main = 'Quantile-Comparison Plot', ...)
   lines(tq, mean(x) + tq * sd(x), lwd = 2)

   invisible(data.frame(x = x, eq = eq, p = p, tq = tq))  # returns a data frame
}

qq.dat <- qq(income)
head(qq.dat)      # first 6 quantiles
    x        eq            p        tq
1 704 -2.383548 0.0002601457 -3.470086
2 715 -2.340506 0.0007804370 -3.163121
3 720 -2.320941 0.0013007284 -3.011284
4 726 -2.297463 0.0018210198 -2.907609
5 727 -2.293550 0.0023413111 -2.828093
6 732 -2.273986 0.0028616025 -2.763232
tail(qq.dat)      # last 6 quantiles
        x       eq         p       tq
1917 1983 2.621101 0.9971384 2.763232
1918 2002 2.695447 0.9976587 2.828093
1919 2025 2.785445 0.9981790 2.907609
1920 2033 2.816748 0.9986993 3.011284
1921 2071 2.965440 0.9992196 3.163121
1922 2115 3.137609 0.9997399 3.470086
# QQ-plot with standardized empirical quantiles
# NOTE: plots only differ in the scale of the y-axis
plot(qq.dat$tq, qq.dat$eq)  
abline(a = 0, b = 1)        # add reference line

qqPlot from package car

par(mfrow=c(1,3))
# ::::: use the qqPlot() function from the car library :::::
library(car)
Loading required package: carData
qqPlot(income, col = 1, cex = .7)  
[1] 1493 1190
qqPlot(income, col = 1, cex = .7, distribution = "chisq", df=3)  
[1] 1493 1190
qqPlot(rchisq(1000, df=3), col = 1, cex = .7, distribution = "chisq", df=3)  

[1] 101 675

qqnorm

qqnorm(income)            # standard QQ-plot from the "stats" package

Scatterplots

- Simple scatterplot of “age” and “occupational experience”

  • without and with jittering
par(mfrow=c(1,3))
# :::: simple scatterplot of "age" and "occupational experience" :::::
par(mar = c(4.5, 4.5, 1, 1))
plot(age, oexp, xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16)
plot(jitter(age), jitter(oexp), xlab = 'Age', ylab = 'Occupational Experience', 
     cex = .4, pch = 16)   # with default jittering
plot(jitter(age, factor = 3), jitter(oexp, factor = 3),    # more jittering: factor = 3
     xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16)

  • changing the degree of transparency
par(mfrow=c(1,3))
for (i in c(0.2, 0.5, 0.8)) {
  plot(jitter(incex$age, factor = 3), jitter(incex$oexp, factor = 3),
       xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16, 
       col=rgb(red = 0, green = 0, blue = 0, alpha = i), main=paste("alpha =", i))
}

library(scales)
par(mfrow=c(1,3))
for (i in c(0.2, 0.5, 0.8)) {
  plot(jitter(incex$age, factor = 3), jitter(incex$oexp, factor = 3),
       xlab = 'Age', ylab = 'Occupational Experience', cex = .4, pch = 16, 
       col=alpha("black", i), main=paste("alpha =", i))
}

- Simple scatterplot of “occ” and “occupational experience”

# for categorical variable (x-axis)
plot(oexp, occ, 
     xlab = 'Occupational Experience', ylab = 'Occupational Status',  
     cex = .4, pch = 16)

plot(oexp, occ, 
     xlab = 'Occupational Experience', ylab = 'Occupational Status',  
     cex = .4, pch = 16, yaxt = 'n')
axis(2, 1:3, levels(occ))

plot(jitter(oexp, factor = 3), jitter(as.numeric(occ), factor = 2), 
     xlab = 'Occupational Experience', ylab = 'Occupational Status',  
     cex = .4, pch = 16, yaxt = 'n')
axis(2, 1:3, levels(occ))

par(mfrow=c(1,2))
plot(jitter(as.numeric(edu), factor = 2), jitter(as.numeric(occ), factor = 2), 
     xlab = 'Educational Level', ylab = 'Occupational Status',  
     cex = .4, pch = 16, xaxt = 'n', yaxt = 'n')
axis(1, 1:3, levels(edu))
axis(2, 1:3, levels(occ))

boxplot(oexp ~ occ, data = incex, horizontal = T, 
   xlab = 'Occupational Experience', ylab = 'Occupational Status')

- Scatterplot-matrix

plot(incex[, c('income', 'oexp', 'age', 'edu')], cex = .2)

# from the "car"-package
library(car)
scatterplotMatrix(incex[, c('income', 'oexp', 'age', 'edu')], cex = .3)

- Conditioning plots

# coplot()
coplot(income ~ oexp | edu, data = incex)

coplot(income ~ oexp | edu * occ, data = incex, cex = .6)

# xyplot() from the "lattice"-package
library(lattice)

Attaching package: 'lattice'
The following object is masked _by_ '.GlobalEnv':

    qq

xyplot(income ~ oexp, data = incex)

xyplot(income ~ oexp | edu, data = incex)

xyplot(income ~ oexp | edu * occ, data = incex, cex = .4)

- Scatterplots with color and shape separators

# color separator
plot(incex$age, incex$oexp, xlab = 'Age', ylab = 'Occupational
     Experience', cex = .4, pch = 16, col=c("red", "green")[as.numeric(sex)])
legend('topleft', c('Male', 'Female'), bty = 'n',
      col = c('red', 'green'), pch = c(16, 16))

# shape separator
plot(incex$age, incex$oexp, xlab = 'Age', ylab = 'Occupational
     Experience', cex = .4, pch = c(1, 3)[as.numeric(sex)])
legend('topleft', c('Male', 'Female'), bty = 'n', pch = c(1, 3))

- 3-dim scatterplot

cloud(income ~ oexp * sex, data = incex, col = as.numeric(sex)+1)

detach(incex)   # detaches the attached data frame "incex"